## R version 4.3.2 (2023-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.33     R6_2.5.1          fastmap_1.1.1     xfun_0.41        
##  [5] cachem_1.0.8      knitr_1.45        htmltools_0.5.7   rmarkdown_2.25   
##  [9] cli_3.6.1         sass_0.4.7        jquerylib_0.1.4   compiler_4.3.2   
## [13] rstudioapi_0.15.0 tools_4.3.2       evaluate_0.23     bslib_0.5.1      
## [17] yaml_2.3.7        rlang_1.1.2       jsonlite_1.8.7
## Loading required package: ggplot2
## Loading required package: lattice
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Import the Case Study 2 data set

# List files in root dir
list.files()
## [1] "CaseStudy2.html"     "CaseStudy2.pptx"     "CaseStudy2.Rmd"     
## [4] "CaseStudy2DDS.Rproj" "data"                "README.md"
# Fetch data set from local folder
df_cs2 = read.csv("data/CaseStudy2-data.csv", header = TRUE) 
df_cs2_no_attrition = read.csv("data/CaseStudy2CompSet No Attrition.csv", header = TRUE) 
df_cs2_no_salary = read.csv("data/CaseStudy2CompSet No Salary.csv", header = TRUE) 

Tidy the Case Study 2 Data Set

## 'data.frame':    870 obs. of  36 variables:
##  $ ID                      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                     : int  32 40 35 32 24 27 41 37 34 34 ...
##  $ Attrition               : chr  "No" "No" "No" "No" ...
##  $ BusinessTravel          : chr  "Travel_Rarely" "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" ...
##  $ DailyRate               : int  117 1308 200 801 567 294 1283 309 1333 653 ...
##  $ Department              : chr  "Sales" "Research & Development" "Research & Development" "Sales" ...
##  $ DistanceFromHome        : int  13 14 18 1 2 10 5 10 10 10 ...
##  $ Education               : int  4 3 2 4 1 2 5 4 4 4 ...
##  $ EducationField          : chr  "Life Sciences" "Medical" "Life Sciences" "Marketing" ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
##  $ EnvironmentSatisfaction : int  2 3 3 3 1 4 2 4 3 4 ...
##  $ Gender                  : chr  "Male" "Male" "Male" "Female" ...
##  $ HourlyRate              : int  73 44 60 48 32 32 90 88 87 92 ...
##  $ JobInvolvement          : int  3 2 3 3 3 3 4 2 3 2 ...
##  $ JobLevel                : int  2 5 3 3 1 3 1 2 1 2 ...
##  $ JobRole                 : chr  "Sales Executive" "Research Director" "Manufacturing Director" "Sales Executive" ...
##  $ JobSatisfaction         : int  4 3 4 4 4 1 3 4 3 3 ...
##  $ MaritalStatus           : chr  "Divorced" "Single" "Single" "Married" ...
##  $ MonthlyIncome           : int  4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
##  $ MonthlyRate             : int  9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
##  $ NumCompaniesWorked      : int  2 1 2 1 1 1 2 2 1 1 ...
##  $ Over18                  : chr  "Y" "Y" "Y" "Y" ...
##  $ OverTime                : chr  "No" "No" "No" "No" ...
##  $ PercentSalaryHike       : int  11 14 11 19 13 21 12 14 19 14 ...
##  $ PerformanceRating       : int  3 3 3 3 3 4 3 3 3 3 ...
##  $ RelationshipSatisfaction: int  3 1 3 3 3 3 1 3 4 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  1 0 0 2 0 2 0 3 1 1 ...
##  $ TotalWorkingYears       : int  8 21 10 14 6 9 7 8 1 8 ...
##  $ TrainingTimesLastYear   : int  3 2 2 3 2 4 5 5 2 3 ...
##  $ WorkLifeBalance         : int  2 4 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : int  5 20 2 14 6 9 4 1 1 8 ...
##  $ YearsInCurrentRole      : int  2 7 2 10 3 7 2 0 1 2 ...
##  $ YearsSinceLastPromotion : int  0 4 2 5 1 1 0 0 0 7 ...
##  $ YearsWithCurrManager    : int  3 9 2 7 3 7 3 0 0 7 ...
##        ID             Age         Attrition         BusinessTravel    
##  Min.   :  1.0   Min.   :18.00   Length:870         Length:870        
##  1st Qu.:218.2   1st Qu.:30.00   Class :character   Class :character  
##  Median :435.5   Median :35.00   Mode  :character   Mode  :character  
##  Mean   :435.5   Mean   :36.83                                        
##  3rd Qu.:652.8   3rd Qu.:43.00                                        
##  Max.   :870.0   Max.   :60.00                                        
##    DailyRate       Department        DistanceFromHome   Education    
##  Min.   : 103.0   Length:870         Min.   : 1.000   Min.   :1.000  
##  1st Qu.: 472.5   Class :character   1st Qu.: 2.000   1st Qu.:2.000  
##  Median : 817.5   Mode  :character   Median : 7.000   Median :3.000  
##  Mean   : 815.2                      Mean   : 9.339   Mean   :2.901  
##  3rd Qu.:1165.8                      3rd Qu.:14.000   3rd Qu.:4.000  
##  Max.   :1499.0                      Max.   :29.000   Max.   :5.000  
##  EducationField     EmployeeCount EmployeeNumber   EnvironmentSatisfaction
##  Length:870         Min.   :1     Min.   :   1.0   Min.   :1.000          
##  Class :character   1st Qu.:1     1st Qu.: 477.2   1st Qu.:2.000          
##  Mode  :character   Median :1     Median :1039.0   Median :3.000          
##                     Mean   :1     Mean   :1029.8   Mean   :2.701          
##                     3rd Qu.:1     3rd Qu.:1561.5   3rd Qu.:4.000          
##                     Max.   :1     Max.   :2064.0   Max.   :4.000          
##     Gender            HourlyRate     JobInvolvement     JobLevel    
##  Length:870         Min.   : 30.00   Min.   :1.000   Min.   :1.000  
##  Class :character   1st Qu.: 48.00   1st Qu.:2.000   1st Qu.:1.000  
##  Mode  :character   Median : 66.00   Median :3.000   Median :2.000  
##                     Mean   : 65.61   Mean   :2.723   Mean   :2.039  
##                     3rd Qu.: 83.00   3rd Qu.:3.000   3rd Qu.:3.000  
##                     Max.   :100.00   Max.   :4.000   Max.   :5.000  
##    JobRole          JobSatisfaction MaritalStatus      MonthlyIncome  
##  Length:870         Min.   :1.000   Length:870         Min.   : 1081  
##  Class :character   1st Qu.:2.000   Class :character   1st Qu.: 2840  
##  Mode  :character   Median :3.000   Mode  :character   Median : 4946  
##                     Mean   :2.709                      Mean   : 6390  
##                     3rd Qu.:4.000                      3rd Qu.: 8182  
##                     Max.   :4.000                      Max.   :19999  
##   MonthlyRate    NumCompaniesWorked    Over18            OverTime        
##  Min.   : 2094   Min.   :0.000      Length:870         Length:870        
##  1st Qu.: 8092   1st Qu.:1.000      Class :character   Class :character  
##  Median :14074   Median :2.000      Mode  :character   Mode  :character  
##  Mean   :14326   Mean   :2.728                                           
##  3rd Qu.:20456   3rd Qu.:4.000                                           
##  Max.   :26997   Max.   :9.000                                           
##  PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours
##  Min.   :11.0      Min.   :3.000     Min.   :1.000            Min.   :80   
##  1st Qu.:12.0      1st Qu.:3.000     1st Qu.:2.000            1st Qu.:80   
##  Median :14.0      Median :3.000     Median :3.000            Median :80   
##  Mean   :15.2      Mean   :3.152     Mean   :2.707            Mean   :80   
##  3rd Qu.:18.0      3rd Qu.:3.000     3rd Qu.:4.000            3rd Qu.:80   
##  Max.   :25.0      Max.   :4.000     Max.   :4.000            Max.   :80   
##  StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance
##  Min.   :0.0000   Min.   : 0.00     Min.   :0.000         Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.: 6.00     1st Qu.:2.000         1st Qu.:2.000  
##  Median :1.0000   Median :10.00     Median :3.000         Median :3.000  
##  Mean   :0.7839   Mean   :11.05     Mean   :2.832         Mean   :2.782  
##  3rd Qu.:1.0000   3rd Qu.:15.00     3rd Qu.:3.000         3rd Qu.:3.000  
##  Max.   :3.0000   Max.   :40.00     Max.   :6.000         Max.   :4.000  
##  YearsAtCompany   YearsInCurrentRole YearsSinceLastPromotion
##  Min.   : 0.000   Min.   : 0.000     Min.   : 0.000         
##  1st Qu.: 3.000   1st Qu.: 2.000     1st Qu.: 0.000         
##  Median : 5.000   Median : 3.000     Median : 1.000         
##  Mean   : 6.962   Mean   : 4.205     Mean   : 2.169         
##  3rd Qu.:10.000   3rd Qu.: 7.000     3rd Qu.: 3.000         
##  Max.   :40.000   Max.   :18.000     Max.   :15.000         
##  YearsWithCurrManager
##  Min.   : 0.00       
##  1st Qu.: 2.00       
##  Median : 3.00       
##  Mean   : 4.14       
##  3rd Qu.: 7.00       
##  Max.   :17.00
## [1] 870  36

##                       ID                      Age                Attrition 
##                        0                        0                        0 
##           BusinessTravel                DailyRate               Department 
##                        0                        0                        0 
##         DistanceFromHome                Education           EducationField 
##                        0                        0                        0 
##            EmployeeCount           EmployeeNumber  EnvironmentSatisfaction 
##                        0                        0                        0 
##                   Gender               HourlyRate           JobInvolvement 
##                        0                        0                        0 
##                 JobLevel                  JobRole          JobSatisfaction 
##                        0                        0                        0 
##            MaritalStatus            MonthlyIncome              MonthlyRate 
##                        0                        0                        0 
##       NumCompaniesWorked                   Over18                 OverTime 
##                        0                        0                        0 
##        PercentSalaryHike        PerformanceRating RelationshipSatisfaction 
##                        0                        0                        0 
##            StandardHours         StockOptionLevel        TotalWorkingYears 
##                        0                        0                        0 
##    TrainingTimesLastYear          WorkLifeBalance           YearsAtCompany 
##                        0                        0                        0 
##       YearsInCurrentRole  YearsSinceLastPromotion     YearsWithCurrManager 
##                        0                        0                        0

Transform the Case Study 2 Data Set

Visualize the Case Study 2 Data Set

Perform Exploratory Data Analysis

# Count the number of employees with attrition, we will compare with rest of sample later
sum(df_cs2$Attrition == "Yes") 
## [1] 140
### Company Profile Snapshot: Age, Gender, Education, Pay, Role ###

# Box plot age by department
box_plot_age_dept <- df_emp %>% ggplot(aes(Department, Age, fill = Department)) + geom_boxplot() + ggtitle(paste("Box plot of Age by Department, n = ", count(df_emp)))
ggplotly(box_plot_age_dept)
# Box plot years in current role by department
box_plot_yirc_dept <- df_emp %>% ggplot(aes(Department, YearsInCurrentRole, fill = Department)) + geom_boxplot() + ggtitle(paste("Box plot of Years In Current Role by Department, n = ", count(df_emp)))
ggplotly(box_plot_yirc_dept)
# Box plot age by job role
box_plot_age_role <- merge(df_emp, df_job) %>% ggplot(aes(JobRole, Age, fill = JobRole)) + geom_boxplot() + ggtitle(paste("Box plot of Age by Job Role, n = ", count(df_emp))) + theme(axis.text.x = element_text(angle = 15, vjust = 0.5, hjust = 1))
ggplotly(box_plot_age_role)
# Box plot years in current role by job role
box_plot_role_yicr <- merge(df_emp, df_job) %>% ggplot(aes(JobRole, YearsInCurrentRole, fill = JobRole)) + geom_boxplot() + ggtitle(paste("Box plot of Years in current role by Job Role, n = ", count(df_emp))) + xlab("Job Role") + ylab("Years in current role") + theme(axis.text.x = element_text(angle = 15, vjust = 0.5, hjust = 0.5)) 
ggplotly(box_plot_role_yicr) 
# Bar plot job role by gender
bar_plot_dodge_role_gender <- merge(df_job, df_emp) %>% ggplot(aes(JobRole, fill = Gender)) + geom_bar(position = "dodge") + ggtitle(paste("Bar plot of Job Roles by Gender,  n = ", count(df))) + xlab("Job Role") + ylab("Count") + theme(axis.text.x = element_text(angle = 15, vjust = 0.5, hjust = 0.5))
ggplotly(bar_plot_dodge_role_gender)
# Box plot of monthly income by job role
box_plot_income_role <- df %>% ggplot(aes(JobRole, MonthlyIncome, fill = JobRole)) + geom_boxplot() + ggtitle(paste("Box plot of Monthly Income by Job Role, n = ", count(df))) + xlab("Job Role") + ylab("Monthly Income (USD)") + theme(axis.text.x = element_text(angle = 15, vjust = 0.5, hjust = 0.5)) 
ggplotly(box_plot_income_role) 
# Bar plot of job satisfaction by job role
bar_plot_dodge_job_sat <- df %>% ggplot(aes(JobSatisfaction, fill = JobRole)) + geom_bar(position = "dodge") + ggtitle(paste("Bar plot of Job Satisfaction by Job Roles,  n = ", count(df))) + xlab("Job Satisfaction") + ylab("Count") + theme(axis.text.x = element_text(angle = 15, vjust = 0.5, hjust = 1))
ggplotly(bar_plot_dodge_job_sat)
# Histogram of monthly income
hist_wrap_income_role <- df %>% ggplot(aes(MonthlyIncome, fill = JobRole)) + geom_histogram(binwidth = 200) + ggtitle(paste("Histogram of Monthly Income by Job Role, n = ", count(df))) + xlab("Monthly Income (USD)") + ylab("Count") + theme(axis.text.x = element_text(angle = 15, vjust = 0, hjust = 0.1)) + facet_wrap(~JobRole)
ggplotly(hist_wrap_income_role)
# Histogram of job satisfaction by job role
hist_wrap_income_sat <- df %>% ggplot(aes(MonthlyIncome, fill = JobRole, position = "stack")) + geom_histogram(binwidth = 200) + ggtitle(paste("Histogram of Monthly Income by Job Satisfaction, n = ", count(df))) + xlab("Monthly Income (USD)") + ylab("Count") + facet_wrap(~JobSatisfaction)
ggplotly(hist_wrap_income_sat)
### Attritional Factors by Variables ###

# Bar plot of attrition by job role
hist_att_role <- merge(df_emp, df_job) %>% ggplot(aes(JobRole, fill = Attrition)) + geom_bar(position = "stack") + ggtitle(paste("Bar plot of Attrition by Job Role, n = ", count(df_emp))) + xlab("Job Role") + ylab("Count") + theme(axis.text.x = element_text(angle = 15, vjust = 0, hjust = 1))
ggplotly(hist_att_role)
# Bar plot of attrition v. gender by job role
hist_wrap_role_gender <- merge(df_emp, df_job) %>% ggplot(aes(Attrition, fill = Gender)) + geom_bar(position = "dodge") + ggtitle(paste("Bar plot of Attrition by Gender, n = ", count(df_emp))) + xlab("Attrition") + ylab("Count") + facet_wrap(~JobRole)
ggplotly(hist_wrap_role_gender)
# PercentHike v. Years In Current Role
plot_wrap_psh_yslp <- df_cs2 %>% ggplot(aes(YearsSinceLastPromotion, PercentSalaryHike)) + geom_point(aes(color = Attrition), size = 0.2, position = "jitter") + facet_wrap(~JobRole)
ggplotly(plot_wrap_psh_yslp)
# Scatter plot years since last promotion v. years in current role
plot_wrap_yslp_yicr <- df_cs2 %>% ggplot(aes(YearsSinceLastPromotion, YearsInCurrentRole)) + geom_point(aes(color = Attrition), size = 0.2, position = "jitter") + facet_wrap(~JobRole)
ggplotly(plot_wrap_yslp_yicr)
# *** Scatter plot years with current manager v. years in current role ***
plot_wrap_ywcm_yicr <- df_cs2 %>% ggplot(aes(YearsWithCurrManager, YearsInCurrentRole)) + geom_point(aes(color = Attrition), size = 0.2, position = "jitter") + facet_wrap(~JobRole)
ggplotly(plot_wrap_ywcm_yicr)
# Scatter plot years with current manager v. job satisfaction
plot_wrap_ywcm_js <- df_cs2 %>% ggplot(aes(YearsWithCurrManager, JobSatisfaction)) + geom_point(aes(color = Attrition), size = 0.2, position = "jitter") + facet_wrap(~JobRole)
ggplotly(plot_wrap_ywcm_js)

Model the Case Study 2 data frame

KNN Classification

#source("analysis/knn.R")

# Classification - attrition by leadership management
set.seed(6)
split_percent <- 0.7

l_train <- df_cs2 %>% 
  select(Attrition, YearsWithCurrManager, YearsInCurrentRole)

trainIndices = sample(1:dim(l_train)[1], round(split_percent * dim(l_train)[1]))
train = l_train[trainIndices,]
test = l_train[-trainIndices,]

scatter_smooth_ywcm_yicr <- l_train %>% ggplot(aes(YearsWithCurrManager, YearsInCurrentRole, color = Attrition)) + geom_point(position = "jitter") + geom_smooth(aes(color = Attrition)) + ggtitle(paste("Plot of Years with current manager v. Years in current role, n = ", count(l_train)))
ggplotly(scatter_smooth_ywcm_yicr)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
classification <- knn(train[,2:3], test[,2:3], train$Attrition, prob = TRUE, k = 5)
table(classification, test$Attrition)
##               
## classification  No Yes
##            No  216  43
##            Yes   2   0
confusionMatrix(table(classification, test$Attrition))
## Confusion Matrix and Statistics
## 
##               
## classification  No Yes
##            No  216  43
##            Yes   2   0
##                                           
##                Accuracy : 0.8276          
##                  95% CI : (0.7762, 0.8714)
##     No Information Rate : 0.8352          
##     P-Value [Acc > NIR] : 0.6673          
##                                           
##                   Kappa : -0.0149         
##                                           
##  Mcnemar's Test P-Value : 2.479e-09       
##                                           
##             Sensitivity : 0.9908          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.8340          
##          Neg Pred Value : 0.0000          
##              Prevalence : 0.8352          
##          Detection Rate : 0.8276          
##    Detection Prevalence : 0.9923          
##       Balanced Accuracy : 0.4954          
##                                           
##        'Positive' Class : No              
## 
# Classification - attrition by time duration
set.seed(6)
split_percent <- 0.7

l_train <- df_cs2 %>% 
  select(Attrition, YearsAtCompany, YearsInCurrentRole)

trainIndices = sample(1:dim(l_train)[1], round(split_percent * dim(l_train)[1]))
train = l_train[trainIndices,]
test = l_train[-trainIndices,]

scatter_smooth_yac_yicr <- l_train %>% ggplot(aes(YearsAtCompany, YearsInCurrentRole, color = Attrition)) + geom_point(position = "jitter") + geom_smooth(aes(color = Attrition)) + ggtitle(paste("Plot of Years at company v. years in current role, n = ", count(l_train)))
ggplotly(scatter_smooth_yac_yicr)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
classification <- knn(train[,2:3], test[,2:3], train$Attrition, prob = TRUE, k = 5)
table(classification, test$Attrition)
##               
## classification  No Yes
##            No  218  43
##            Yes   0   0
confusionMatrix(table(classification, test$Attrition))
## Confusion Matrix and Statistics
## 
##               
## classification  No Yes
##            No  218  43
##            Yes   0   0
##                                           
##                Accuracy : 0.8352          
##                  95% CI : (0.7846, 0.8781)
##     No Information Rate : 0.8352          
##     P-Value [Acc > NIR] : 0.5406          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 1.504e-10       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.8352          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.8352          
##          Detection Rate : 0.8352          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : No              
## 
# Classification - attrition by peer relationship
set.seed(6)
split_percent <- 0.7

l_train <- df_cs2 %>% 
  select(Attrition, RelationshipSatisfaction, YearsWithCurrManager)

trainIndices = sample(1:dim(l_train)[1], round(split_percent * dim(l_train)[1]))
train = l_train[trainIndices,]
test = l_train[-trainIndices,]

scatter_smooth_rel_yicr <- l_train %>% ggplot(aes(RelationshipSatisfaction, YearsWithCurrManager, color = Attrition)) + geom_point(position = "jitter") + geom_smooth(aes(color = Attrition)) + ggtitle(paste("Plot of Number of companies v. years at company, n = ", count(l_train)))

classification <- knn(train[,2:3], test[,2:3], train$Attrition, prob = TRUE, k = 5)
table(classification, test$Attrition)
##               
## classification  No Yes
##            No  218  43
##            Yes   0   0
confusionMatrix(table(classification, test$Attrition))
## Confusion Matrix and Statistics
## 
##               
## classification  No Yes
##            No  218  43
##            Yes   0   0
##                                           
##                Accuracy : 0.8352          
##                  95% CI : (0.7846, 0.8781)
##     No Information Rate : 0.8352          
##     P-Value [Acc > NIR] : 0.5406          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 1.504e-10       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.8352          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.8352          
##          Detection Rate : 0.8352          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : No              
## 

Naive Bayes Classification

#source("analysis/nb.R")

# Continuous predictor of attrition by leadership
df_attr <- df_cs2 %>%
  select(ID, Attrition, YearsWithCurrManager, YearsInCurrentRole) %>%
  mutate(ID = as.factor(ID), YearsInCurrentRole = as.factor(YearsInCurrentRole), YearsWithCurrManager = as.factor(YearsWithCurrManager))

df_attr %>% ggplot(aes(YearsWithCurrManager, YearsInCurrentRole)) + geom_point(aes(color = Attrition), position = "jitter", size = 0.3) + geom_smooth(aes(color = Attrition), size = 0.3)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

iterations = 100
m_acc <- matrix(nrow = iterations)
split_percent <- 0.7
for(i in 1:iterations) {
  train_indices = sample(1:dim(df_attr)[1], round(split_percent * dim(df_attr)[1]))
  train = df_attr[train_indices,]
  test = df_attr[-train_indices,]
  
  model = naiveBayes(train[,3:4], train$Attrition)
  table(predict(model, test[,3:4]), test$Attrition)
  cm <- confusionMatrix(table(predict(model,test[,3:4]), test$Attrition))
  m_acc[i] <- cm$overall[1]
}

mean_acc <- colMeans(m_acc)
mean_acc
## [1] 0.8044828

Linear Regression Analysis

#source("analysis/lm.R")

# Linear Model Monthly Income v. Monthly Rate
fit <- lm(MonthlyIncome~HourlyRate, data = df_pay)

df_pay %>% ggplot(aes(MonthlyIncome, MonthlyRate)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

beta_0_hat <- fit$coefficients[1]
beta_1_hat <- fit$coefficients[2]

SE_beta_0_hat <- summary(fit)$coefficients[1,2]
SE_beta_1_hat <- summary(fit)$coefficients[2,2]

# Intercept
tstat_int <- beta_0_hat / SE_beta_0_hat
pvalue_int <- (1-pt(tstat_int, length(df_pay$MonthlyIncome)-2)) * 2
tstat_int
## (Intercept) 
##    11.94203
pvalue_int
## (Intercept) 
##           0
# Slope
tstat_slope <- beta_1_hat / SE_beta_1_hat
pvalue_slope <- (pt(tstat_slope, length(df_pay)-2)) * 2
tstat_slope
## HourlyRate 
##  0.0704479
pvalue_slope
## HourlyRate 
##   1.054192
summary(fit)
## 
## Call:
## lm(formula = MonthlyIncome ~ HourlyRate, data = df_pay)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5322  -3539  -1444   1779  13609 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6354.4250   532.1060   11.94   <2e-16 ***
## HourlyRate     0.5462     7.7535    0.07    0.944    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4600 on 868 degrees of freedom
## Multiple R-squared:  5.718e-06,  Adjusted R-squared:  -0.001146 
## F-statistic: 0.004963 on 1 and 868 DF,  p-value: 0.9439
confint(fit)
##                  2.5 %     97.5 %
## (Intercept) 5310.06024 7398.78985
## HourlyRate   -14.67154   15.76397
# Welch Two Sample t-test
t.test(df_pay$MonthlyIncome, df_pay$MonthlyRate)
## 
##  Welch Two Sample t-test
## 
## data:  df_pay$MonthlyIncome and df_pay$MonthlyRate
## t = -27.648, df = 1487.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -8498.351 -7372.362
## sample estimates:
## mean of x mean of y 
##  6390.264 14325.621
# Conduct Hypothesis Test

# LOOCV
pred_error_sq <- c(0)
for(i in 1:dim(df_pay)[1]) {
  loocv_i <- df_pay[-i,]
  fit <- lm(MonthlyIncome ~ MonthlyRate, data = loocv_i)
  pred_i <- predict(fit, data.frame(MonthlyRate = df_pay[i,4]))
  pred_error_sq <- pred_error_sq + (df_pay[i,4] - pred_i)^2
}
SSE <- var(df_pay$MonthlyIncome)
R_squared <-  1 - (pred_error_sq/SSE)
MSE <- pred_error_sq / length(df_pay)
RMSE <- sqrt(pred_error_sq/length(df_pay))
RMSE
##        1 
## 102794.5